library(tidyverse)
library(haven)
library(grf)
library(glmnet)
library(bartMachine)
library(ggplot2)
library(ggthemes)

## Old Set
# Builds the "limited" train/test split from STAR_Students.sav and writes it to
# limited_training_set.csv / limited_testing_set.csv.
students <- read_sav("STAR_Students.sav")
set.seed(1)

# Keep students whose class type and school id are identical across K-3,
# with yearsstar == 4, yearssmall in {0, 4}, class type in {1, 2} and
# flagg3 == 1.
# NOTE(review): presumably class types 1/2 are the small/regular arms and
# flagg3 marks grade-3 participation -- confirm against the STAR codebook.
studentrestricted <- students %>%
  filter(
    yearsstar == 4,
    yearssmall %in% c(0, 4),
    g1classtype == g2classtype,
    g2classtype == g3classtype,
    gkclasstype == g1classtype,
    g1schid == g2schid,
    g2schid == g3schid,
    gkschid == g1schid
  ) %>%
  filter(g1classtype %in% c(1, 2), flagg3 == 1)

# Grade-3 outcome columns.
studentg3achievements <- studentrestricted[
  c("g3treadss", "g3tmathss", "g3tlangss", "g3readbsraw", "g3mathbsraw")
]

# All columns that are not grade-specific (gk*, g1*-g8*) or hs*.
# A logical mask is used instead of negative grep() positions: with negative
# positions, a pattern set that matches nothing would yield -integer(0) and
# silently select zero columns instead of keeping all of them.
studentothers <- studentrestricted[
  , !grepl("^(gk|g[1-8]|hs)", names(studentrestricted))
]

# Assemble outcomes, remaining covariates, the kindergarten class type (as
# "classsize") and the grade-1 school id (as "schid"). Naming the appended
# columns in the call avoids renaming "whatever column is last" afterwards.
datalim <- cbind(
  studentg3achievements,
  studentothers,
  classsize = studentrestricted$gkclasstype,
  schid = studentrestricted$g1schid
)
datalim <- datalim[complete.cases(datalim), ]

# 70/30 split. sample.int(n, k) draws the same indices as sample(1:n, k) under
# the same seed, but is well-defined when n is 0 or 1.
train_samples <- sample.int(
  nrow(datalim),
  round(0.7 * nrow(datalim)),
  replace = FALSE
)
trainset <- datalim[train_samples, ]
testset <- datalim[-train_samples, ]
write_csv(trainset, "limited_training_set.csv")
write_csv(testset, "limited_testing_set.csv")

## New Set
# Builds the expanded train/test split: student records from STAR_Students.sav
# joined with school-level covariates from STAR_K-3_Schools.sav, written to
# limited_training_set_new.csv / limited_testing_set_new.csv.
students <- read_sav("STAR_Students.sav")
set.seed(1)

# Only the kindergarten class type (1 or 2) and flagg3 == 1 are required here;
# no consistency across grades is imposed.
studentrestricted <- students %>%
  filter(gkclasstype %in% c(1, 2), flagg3 == 1)

# Grade-3 outcome columns.
studentg3achievements <- studentrestricted[
  c("g3treadss", "g3tmathss", "g3tlangss")
]

# All columns that are not grade-specific (gk*, g1*-g8*) or hs*.
# A logical mask is used instead of negative grep() positions: with negative
# positions, a pattern set that matches nothing would yield -integer(0) and
# silently select zero columns instead of keeping all of them.
studentothers <- studentrestricted[
  , !grepl("^(gk|g[1-8]|hs)", names(studentrestricted))
]

# Assemble outcomes, remaining covariates, the kindergarten class type (as
# "classsize") and the kindergarten school id (as "schid"), then keep only
# the modelling columns and the rows with no missing values.
datalim <- cbind(
  studentg3achievements,
  studentothers,
  classsize = studentrestricted$gkclasstype,
  schid = studentrestricted$gkschid
)
datalim <- datalim %>%
  select(
    g3treadss, g3tmathss, g3tlangss,
    stdntid, gender, race, birthmonth, birthyear,
    classsize, schid
  )
datalim <- datalim[complete.cases(datalim), ]

# School-level covariates. Missing GKWHITE is recoded to 0 before incomplete
# schools are dropped.
# NOTE(review): treating a missing GKWHITE as 0 is a modelling assumption --
# confirm it is intended rather than an imputation shortcut.
schools <- read_sav("STAR_K-3_Schools.sav")
schools <- schools %>%
  select(schid, SCHLURBN, GRDRANGE, GKENRMNT, GKFRLNCH, GKBUSED, GKWHITE)
schools$GKWHITE[is.na(schools$GKWHITE)] <- 0
schools <- schools[complete.cases(schools), ]

# Join on the school id explicitly instead of relying on natural-join key
# detection; "schid" is the only column the two tables share. Students whose
# school was dropped above get NA covariates and are removed by the
# complete.cases() filter that follows.
totalsetexp <- left_join(datalim, schools, by = "schid")
totalsetexp <- totalsetexp[complete.cases(totalsetexp), ]
totalsetexp <- totalsetexp %>%
  filter(birthyear > 1978, race < 3, birthyear < 1981)

# 70/30 split. sample.int(n, k) draws the same indices as sample(1:n, k) under
# the same seed, but is well-defined when n is 0 or 1.
train_samples <- sample.int(
  nrow(totalsetexp),
  round(0.7 * nrow(totalsetexp)),
  replace = FALSE
)
trainset <- totalsetexp[train_samples, ]
testset <- totalsetexp[-train_samples, ]
write_csv(trainset, "limited_training_set_new.csv")
write_csv(testset, "limited_testing_set_new.csv")